library(tidyverse) # Colección de paquetes para ciencia de datos (incluye ggplot2, dplyr, tidyr, readr, purrr, tibble)
library(reshape) # Herramientas para reorganizar datos
library(Hmisc) # resúmenes estadÃsticos
library(limma) # Análisis de datos de expresión genética
library(AnnotationDbi) # Interfaz para bases de datos de anotaciones bioinformáticas
library(org.Hs.eg.db) # Datos de anotación para genes humanos
library(VennDiagram) # Generación de diagramas de Venn
library(gridExtra) # Mostrar varias gráficas
library(patchwork) # Combinar múltiples ggplots en un único plot
library(ggrepel) # Mejora la visualización de texto en ggplots evitando solapamientos de texto
library(Rtsne) # Implementación de t-SNE
library(umap) # Implementación de UMAP
library(ggVennDiagram) # diagrama de VennDescargar los datos
# covariables
ROSMAP_RINPMIAGESEX_covs <- readRDS("~/Library/CloudStorage/OneDrive-UNIVERSIDADDEMURCIA/Documentos/Fernando/Master Bioinformatica/TFM/datos/ROSMAP_RINPMIAGESEX_covs.rds")
covs <- ROSMAP_RINPMIAGESEX_covs
rownames(covs) <- covs$mrna_id
covs$study <- as.factor(covs$study)
covs$projid <- as.character(covs$projid)
covs$ceradsc <- as.factor(covs$ceradsc)
covs$cogdx <- as.factor(covs$cogdx)
covs$neuroStatus <- as.factor(covs$neuroStatus)
#datos corregidos
ROSMAP_RINPMIAGESEX_resids <- readRDS("~/Library/CloudStorage/OneDrive-UNIVERSIDADDEMURCIA/Documentos/Fernando/Master Bioinformatica/TFM/datos/ROSMAP_RINPMIAGESEX_resids.rds")
mat_exp <- ROSMAP_RINPMIAGESEX_resids# geneset de Alzheimer extraidos de KEGG
tab <- getGeneKEGGLinks(species="hsa")
tab$Symbol <- mapIds(org.Hs.eg.db, tab$GeneID,
column="SYMBOL", keytype="ENTREZID")
paths <- getKEGGPathwayNames(species="hsa")
paths[paths$PathwayID=="hsa05010", ]Vamos a seleccionar solamente los genes de Alzheimer de KEGG en nuestro dataset de expresión génica.
Visualizamos si se han seleccionado todos los genes y cuantos del geneset de KEGG
venn.plot <- venn.diagram(
x = list(GenesMatriz = colnames(mat_exp), GenesetAlzheimer = geneset_alz),
category.names = c("Matrix Genes", "Geneset Alzheimer"),
filename = NULL,
output = FALSE, # Asegura que no se exporte a un archivo
fill = c("#00CED1", "#E9967A"),
cex = 1, # Aumenta el tamaño del texto
fontface = "bold",
cat.cex = 1, # Aumenta el tamaño del texto de las categorÃas
cat.fontface = "bold",
cat.default.pos = "text",
cat.pos = 25, #posicion de las categoricas
cat.dist = 0.1, #distancia de las categoricas
rotation.degree = 0,
margin = 0.1, # hacerla más pequeña
lwd=0.5,
lty = "dashed", # Estilo de lÃnea discontinua
edge.col = "grey", # Color de los bordes
main = "Alzheimer genes in exp matrix",
main.fontface= "bold",
main.cex = 2,
main.pos = c(0.5, 1)
)
grid.newpage() # Asegura que el lienzo esté limpio
grid.draw(venn.plot)# Scree plot con los datos escalados
var_exp <- pca_alz_genes_scaled$sdev^2
prop_var_exp <- var_exp / sum(var_exp)
cum_var_exp <- cumsum(prop_var_exp)
df_var_exp <- data.frame(Comp = 1:length(prop_var_exp), VarExp = prop_var_exp)
df_cum_var_exp <- data.frame(Comp = 1:length(cum_var_exp), CumVarExp = cum_var_exp)
ggplot(df_var_exp[1:20,], aes(x = Comp, y = VarExp)) +
geom_bar(stat = "identity", fill = "skyblue") +
geom_line(aes(group = 1), color = "blue") +
geom_point(color = "blue") +
theme_minimal() +
labs(x = "Principal components", y = "Variance", title = "Scree Plot scaled") +
ylim(c(0,1)) +
geom_line(data = df_cum_var_exp[1:20,], aes(x = Comp, y = CumVarExp), color="#8B1A1A") +
geom_point(data = df_cum_var_exp[1:20,], aes(x = Comp, y = CumVarExp), color = "red") +
geom_bar(data = df_cum_var_exp[1:20,], aes(x = Comp, y = CumVarExp), stat = "identity", fill = "red", alpha= 0.25) +
annotate("text", x = 4, y = 0.85, label = "Cumulative Scree Plot", color = "#8B1A1A", size = 4) +
geom_text(data = df_cum_var_exp[seq(0,20,2),], aes(x=Comp, y = CumVarExp +0.04, label = round(CumVarExp, 2)))Vemos que con las 20 primeras no encontramos ni siquiera el 80% de la varianza.
outlierspc1_scaled <- as.data.frame(pca_alz_genes_scaled$x[abs(pca_alz_genes_scaled$x[,1]) > 2*(pca_alz_genes_scaled$sdev[1]),1])
outlierspc2_scaled <- as.data.frame(pca_alz_genes_scaled$x[abs(pca_alz_genes_scaled$x[,2]) > 2*(pca_alz_genes_scaled$sdev[2]),2])df <- as.data.frame(pca_alz_genes_scaled$x)
plot1 <- ggplot(df, aes(x = PC1)) +
geom_density(fill = "#00CED1", alpha = 0.5) +
geom_vline(xintercept = mean(df$PC1) + 2*pca_alz_genes_scaled$sdev[1], linetype = "dashed", color = "blue") +
geom_vline(xintercept = mean(df$PC1) - 2*pca_alz_genes_scaled$sdev[1], linetype = "dashed", color = "blue") +
labs(title = "PC1 density with 2*SD scaled PCA") +
geom_text(data = outlierspc1_scaled,
aes(x = outlierspc1_scaled[,1], y= 0, label = rownames(outlierspc1_scaled)),
vjust = 1.5, hjust = 0, size = 3, color = "#E9965A", angle = 90, fontface= "bold") +
geom_rug(data = as.data.frame(pca_alz_genes_scaled$x), aes(x= PC1, y = 0), color= ifelse(abs(pca_alz_genes_scaled$x[,1]) > 2*(pca_alz_genes_scaled$sdev[1]), "#8B1A1A", "grey")) +
theme_minimal()
plot2 <- ggplot(df, aes(x = PC2)) +
geom_density(fill = "#00CED1", alpha = 0.5) +
geom_vline(xintercept = mean(df$PC2) + 2*pca_alz_genes_scaled$sdev[2], linetype = "dashed", color = "blue") +
geom_vline(xintercept = mean(df$PC2) - 2*pca_alz_genes_scaled$sdev[2], linetype = "dashed", color = "blue") +
labs(title = "PC2 density with 2*SD scaled PCA") +
geom_text(data = outlierspc2_scaled,
aes(x = outlierspc2_scaled[,1], y= 0, label = rownames(outlierspc2_scaled)),
vjust = 1.5, hjust = 0, size = 3, color = "#E9965A", angle = 90, fontface= "bold") +
geom_rug(data = as.data.frame(pca_alz_genes_scaled$x), aes(x= PC2, y = 0), color= ifelse(abs(pca_alz_genes_scaled$x[,2]) > 2*(pca_alz_genes_scaled$sdev[2]), "#8B1A1A", "grey")) +
theme_minimal()
grid.arrange(plot1, plot2, ncol = 1)mPC1_scaled <- mat_exp[rownames(mat_exp) %in% rownames(outlierspc1_scaled),]
mPC2_scaled <- mat_exp[rownames(mat_exp) %in% rownames(outlierspc2_scaled),]
mPC12_scaled <- mat_exp[(rownames(mat_exp) %in% rownames(outlierspc2_scaled)) & (rownames(mat_exp) %in% rownames(outlierspc1_scaled)),]
not_in_PC12_scaled <- mat_exp[!((rownames(mat_exp) %in% rownames(outlierspc2_scaled)) | (rownames(mat_exp) %in% rownames(outlierspc1_scaled))),]# Asegurarse de que todos son vectores del mismo largo para el dataframe final
max_length <- max(length(rownames(mPC1_scaled)), length(rownames(mPC2_scaled)), length(rownames(mPC12_scaled)))
# Normalizar la longitud de los vectores (en caso de que alguno sea más corto)
rownames_mPC1_scaled <- c(rownames(mPC1_scaled), rep("", max_length - length(rownames(mPC1_scaled))))
rownames_mPC2_scaled <- c(rownames(mPC2_scaled), rep("", max_length - length(rownames(mPC2_scaled))))
rownames_mPC12_scaled <- c(rownames(mPC12_scaled), rep("", max_length - length(rownames(mPC12_scaled))))
final_table <- data.frame(
mPC1 = rownames_mPC1_scaled,
mPC2 = rownames_mPC2_scaled,
mPC1_and_PC2 = rownames_mPC12_scaled
)
final_tableVoy ahora a ver de esas muestras seleccionadas, qué covariables están asociadas con esas muestras outliers
rownames(covs) <- covs$mrna_id
covs2 <- covs
for (i in rownames(covs2)){
if (i %in% rownames(mPC1_scaled) & !(i %in% rownames(mPC12_scaled))){
covs2[i, "sampleset_PCA"] <- "mPC1"
}
if (i %in% rownames(mPC2_scaled) & !(i %in% rownames(mPC12_scaled))){
covs2[i, "sampleset_PCA"] <- "mPC2"
}
if (i %in% rownames(mPC12_scaled)){
covs2[i, "sampleset_PCA"] <- "mPC1 and mPC2"
}
if (!(i %in% rownames(mPC1_scaled)) & !(i %in% rownames(mPC2_scaled))){
covs2[i, "sampleset_PCA"] <- "Not in both"
}
}
covs2$sampleset_PCA <- as.factor(covs2$sampleset_PCA)
data.frame(table(covs2$sampleset_PCA))pca.data.kegg <- data.frame(pca_alz_genes_scaled$x)[,1:2]
colnames(pca.data.kegg) <- c("PC1.KEGG", "PC2.KEGG")
covs2 <- merge(covs2, pca.data.kegg, by="row.names")
rownames(covs2) <- covs2$mrna_id
covs2$Row.names <- NULLplots <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 1:ncol(covs2)) {
col_name <- colnames(covs2)[i]
if (class(covs2[[i]]) == "character" | class(covs2[[i]]) == "numeric"){
next
}
if (class(covs2[[i]]) == "factor"){
p <- covs2 %>%
group_by(across(all_of(col_name)), sampleset_PCA) %>%
summarise(count = n(), .groups = 'drop') %>%
mutate(total = sum(count), percentage = (count / total) * 100) %>%
filter(sampleset_PCA == "mPC1" | sampleset_PCA == "mPC2" | sampleset_PCA == "mPC1 and mPC2") %>%
ggplot(aes_string(x = names(covs2)[i], y = "percentage", fill = "sampleset_PCA")) +
geom_bar(stat="identity", position=position_dodge()) +
labs(x = NULL, y = "%", fill = col_name) +
theme_minimal()
plots[[colnames(covs2)[i]]] <- p
}
}
grid.arrange(plots$study , plots$braaksc, plots$ceradsc, plots$cogdx, plots$apoe_genotype, plots$neuroStatus, ncol=2)plots.pca.kegg <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 3:13) {
# Crea una gráfica para cada variable
p <- ggplot(covs2, aes_string(x = "PC1.KEGG", y = "PC2.KEGG", color = names(covs2)[i])) +
geom_point(show.legend = TRUE) +
geom_text_repel(aes(label=ifelse(mrna_id %in% rownames(outlierspc1_scaled) | mrna_id %in% rownames(outlierspc2_scaled), as.character(gsub("_.*", "", covs2$mrna_id)), "")),
color = "black",
max.overlaps = 10, # Reduce el número máximo de solapamientos
point.padding = unit(0.2, "lines"), # Menos padding alrededor de los puntos
size = 3,
fontface = "bold",
segment.size = 0.2, # LÃneas de guÃa más finas
segment.color = 'grey50',
max.segment.length = unit(0.5, "lines"), # LÃneas de guÃa más cortas
arrow = arrow(length = unit(0.02, "npc"), type = "closed", ends = "last")) +
theme_minimal() +
labs(title = names(covs2)[i],
x = "PC1.KEGG", y = "PC2.KEGG", color = names(covs2)[i]) +
theme(
legend.title = element_blank(),
legend.text = element_text(size = 12),
text = element_text(size = 12),
legend.key.size = unit(0.5, "cm"),
plot.margin = margin(5, 5, 5, 5)# Agrega un margen alrededor de la gráfica si es necesario
)
plots.pca.kegg[[colnames(covs2)[i]]] <- p
}
grid.arrange(plots.pca.kegg$study, plots.pca.kegg$braaksc, ncol=1)Parámetros clave en t-SNE: Perplexity (Perplejidad): Este es uno de los parámetros más importantes en t-SNE. La perplejidad se relaciona con el número de vecinos cercanos que t-SNE considera cuando mapea los datos. Un valor demasiado bajo hace que el modelo se concentre demasiado en la estructura local, mientras que un valor demasiado alto puede llevar a una visión global que pierde detalles. La elección óptima depende del tamaño del dataset, pero valores tÃpicos están entre 5 y 50.
Número de iteraciones: t-SNE es un algoritmo iterativo, y el número de iteraciones puede afectar la estabilidad de los resultados. Un número insuficiente de iteraciones puede resultar en una organización incompleta, mientras que demasiadas iteraciones pueden no proporcionar beneficios adicionales y aumentar el tiempo de cálculo.
Tasa de aprendizaje: Este parámetro controla el tamaño del paso en cada actualización durante la optimización. Una tasa de aprendizaje muy baja puede hacer que el algoritmo tarde mucho en converger, mientras que una tasa demasiado alta puede llevar a una convergencia pobre.
set.seed(1234)
tsne <- Rtsne(mat_exp_alz_genes, dims = 2, theta = 0.0)
tsne.data <- as.data.frame(tsne$Y)
row.names(tsne.data) <- row.names(mat_exp_alz_genes)
tsne.data.covs <- merge(tsne.data, covs, by = "row.names")
tsne.data.covs$Row.names <- NULL
row.names(tsne.data.covs) <- tsne.data.covs$mrna_idrownames(tsne.data) <- rownames(mat_exp_alz_genes)
# Calcular la media y la desviación estándar para cada componente de t-SNE
mean.tsne1 <- mean(tsne.data[,1])
sd.tsne1 <- sd(tsne.data[,1])
mean.tsne2 <- mean(tsne.data[,2])
sd.tsne2 <- sd(tsne.data[,2])
# Identificar muestras a más de 2 desviaciones estándar de la media
outliers.tsne1 <- tsne.data[abs(tsne.data[,1] - mean.tsne1) > 2 * sd.tsne1, ]
outliers.tsne2 <-tsne.data[abs(tsne.data[,2] - mean.tsne2) > 2 * sd.tsne2, ]mtSNE1 <- mat_exp[rownames(mat_exp) %in% rownames(outliers.tsne1),]
mtSNE2 <- mat_exp[rownames(mat_exp) %in% rownames(outliers.tsne2),]
mtSNE12 <- mat_exp[(rownames(mat_exp) %in% rownames(outliers.tsne1)) & (rownames(mat_exp) %in% rownames(outliers.tsne2)),]
not_in_tSNE12 <- mat_exp[!((rownames(mat_exp) %in% rownames(outliers.tsne1)) | (rownames(mat_exp) %in% rownames(outliers.tsne2))),]for (i in rownames(covs2)){
if (i %in% rownames(mtSNE1) & !(i %in% rownames(mtSNE12))){
covs2[i, "sampleset_tSNE"] <- "mtSNE1"
}
if (i %in% rownames(mtSNE2) & !(i %in% rownames(mtSNE12))){
covs2[i, "sampleset_tSNE"] <- "mtSNE2"
}
if (i %in% rownames(mtSNE12)){
covs2[i, "sampleset_tSNE"] <- "mtSNE1 and mtSNE2"
}
if (!(i %in% rownames(mtSNE1)) & !(i %in% rownames(mtSNE2))){
covs2[i, "sampleset_tSNE"] <- "Not in both"
}
}
covs2$sampleset_tSNE <- as.factor(covs2$sampleset_tSNE)
data.frame(table(covs2$sampleset_tSNE))# Asegurarse de que todos son vectores del mismo largo para el dataframe final
max_length <- max(length(rownames(mtSNE1)), length(rownames(mtSNE2)), length(rownames(mtSNE12)))
# Normalizar la longitud de los vectores (en caso de que alguno sea más corto)
rownames_tsne1 <- c(rownames(mtSNE1), rep("", max_length - length(rownames(mtSNE1))))
rownames_tsne2 <- c(rownames(mtSNE2), rep("", max_length - length(rownames(mtSNE2))))
rownames_tsne12 <- c(rownames(mtSNE12), rep("", max_length - length(rownames(mtSNE12))))
final_table <- data.frame(
mtSNE1 = rownames_tsne1,
mtSNE2 = rownames_tsne2,
mtSNE1_and_mtSNE2 = rownames_tsne12
)
final_tablecolnames(tsne.data) <- c("tSNE1.KEGG", "tSNE2.KEGG")
covs2 <- merge(covs2, tsne.data, by="row.names")
rownames(covs2) <- covs2$mrna_id
covs2$Row.names <- NULLplots <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 1:ncol(covs2)) {
col_name <- colnames(covs2)[i]
if (class(covs2[[i]]) == "character" | class(covs2[[i]]) == "numeric" ){
next
}
if (class(covs2[[i]]) == "factor"){
p <- covs2 %>%
group_by(across(all_of(col_name)), sampleset_tSNE) %>%
summarise(count = n(), .groups = 'drop') %>%
filter(sampleset_tSNE == "mtSNE1" | sampleset_tSNE == "mtSNE2" | sampleset_tSNE == "mtSNE1 and mtSNE2") %>%
mutate(total = sum(count), percentage = (count / total) * 100) %>%
ggplot(aes_string(x = names(covs2)[i], y = "percentage", fill = "sampleset_tSNE")) +
geom_bar(stat="identity", position=position_dodge()) +
labs(x = NULL, y = "%", fill = col_name) +
theme_minimal()
plots[[colnames(covs2)[i]]] <- p
}
}
grid.arrange(plots$study , plots$braaksc, plots$ceradsc, plots$cogdx, plots$apoe_genotype, plots$neuroStatus, ncol=2)plots.tsne.kegg <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 3:13) {
# Crea una gráfica para cada variable
p <- ggplot(covs2, aes_string(x = "tSNE1.KEGG", y = "tSNE2.KEGG", color = names(covs2)[i])) +
geom_point(show.legend = TRUE) +
geom_text_repel(aes(label=ifelse(mrna_id %in% rownames(outliers.tsne1) | mrna_id %in% rownames(outliers.tsne2), as.character(gsub("_.*", "", covs2$mrna_id)), "")),
color = "black",
max.overlaps = 20, # Reduce el número máximo de solapamientos
point.padding = unit(0.2, "lines"), # Menos padding alrededor de los puntos
size = 3,
fontface = "bold",
segment.size = 0.2, # LÃneas de guÃa más finas
segment.color = 'grey50',
max.segment.length = unit(0.5, "lines"), # LÃneas de guÃa más cortas
arrow = arrow(length = unit(0.02, "npc"), type = "closed", ends = "last")) +
theme_minimal() +
labs(title = names(covs2)[i],
x = "tSNE1.KEGG", y = "tSNE2.KEGG", color = names(covs2)[i]) +
theme(
legend.title = element_blank(),
legend.text = element_text(size = 12),
text = element_text(size = 12),
legend.key.size = unit(0.5, "cm"),
plot.margin = margin(5, 5, 5, 5)# Agrega un margen alrededor de la gráfica si es necesario
)
plots.tsne.kegg[[colnames(covs2)[i]]] <- p
}
grid.arrange(plots.tsne.kegg$study, plots.tsne.kegg$braaksc, ncol=1)local.config <- umap.defaults
# local.config$n_neighbors <- 4
# local.config$n_components <- 2
# local.config$n_epochs <- 100
# local.config$metric<- "euclidean"
set.seed(1234)
umap.ad <- umap(mat_exp_alz_genes,random_stage=1234, local.config)
umap.data <- as.data.frame(umap.ad$layout)
row.names(umap.data) <- row.names(mat_exp_alz_genes)
umap.data.covs <- merge(umap.data, covs2, by = "row.names")
umap.data.covs$Row.names <- NULL
row.names(umap.data.covs) <- umap.data.covs$mrna_idrownames(umap.data) <- rownames(mat_exp_alz_genes)
# Calcular la media y la desviación estándar para cada componente de t-SNE
mean.umap1 <- mean(umap.data[,1])
sd.umap1 <- sd(umap.data[,1])
mean.umap2 <- mean(umap.data[,2])
sd.umap2 <- sd(umap.data[,2])
# Identificar muestras a más de 2 desviaciones estándar de la media
outliers.umap1 <- umap.data[abs(umap.data[,1] - mean.umap1) > 2 * sd.umap1, ]
outliers.umap2 <-umap.data[abs(umap.data[,2] - mean.umap2) > 2 * sd.umap2, ]mUMAP1 <- mat_exp[rownames(mat_exp) %in% rownames(outliers.umap1),]
mUMAP2 <- mat_exp[rownames(mat_exp) %in% rownames(outliers.umap2),]
mUMAP12 <- mat_exp[(rownames(mat_exp) %in% rownames(outliers.umap1)) & (rownames(mat_exp) %in% rownames(outliers.umap2)),]
not_in_UMAP12 <- mat_exp[!((rownames(mat_exp) %in% rownames(outliers.umap1)) | (rownames(mat_exp) %in% rownames(outliers.umap2))),]for (i in rownames(covs2)){
if (i %in% rownames(mUMAP1) & !(i %in% rownames(mUMAP12))){
covs2[i, "sampleset_UMAP"] <- "mUMAP1"
}
if (i %in% rownames(mUMAP2) & !(i %in% rownames(mUMAP12))){
covs2[i, "sampleset_UMAP"] <- "mUMAP2"
}
if (i %in% rownames(mUMAP12)){
covs2[i, "sampleset_UMAP"] <- "mUMAP1 and mUMAP2"
}
if (!(i %in% rownames(mUMAP1)) & !(i %in% rownames(mUMAP2))){
covs2[i, "sampleset_UMAP"] <- "Not in both"
}
}
covs2$sampleset_UMAP <- as.factor(covs2$sampleset_UMAP)
data.frame(table(covs2$sampleset_UMAP))# Asegurarse de que todos son vectores del mismo largo para el dataframe final
max_length <- max(length(rownames(mUMAP1)), length(rownames(mUMAP2)), length(rownames(mUMAP12)))
# Normalizar la longitud de los vectores (en caso de que alguno sea más corto)
rownames_umap1 <- c(rownames(mUMAP1), rep("", max_length - length(rownames(mUMAP1))))
rownames_umap2 <- c(rownames(mUMAP2), rep("", max_length - length(rownames(mUMAP2))))
rownames_umap12 <- c(rownames(mUMAP12), rep("", max_length - length(rownames(mUMAP12))))
final_table <- data.frame(
mUMAP1 = rownames_umap1,
mUMAP2 = rownames_umap2,
mUMAP1_and_mUMAP2 = rownames_umap12
)
final_tableplots <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 1:ncol(covs2)) {
col_name <- colnames(covs2)[i]
if (class(covs2[[i]]) == "character" | class(covs2[[i]]) == "numeric"){
next
}
if (class(covs2[[i]]) == "factor"){
p <- covs2 %>%
group_by(across(all_of(col_name)), sampleset_UMAP) %>%
summarise(count = n(), .groups = 'drop') %>%
filter(sampleset_UMAP == "mUMAP1" | sampleset_UMAP == "mUMAP2" | sampleset_UMAP == "mUMAP1 and mUMAP2") %>%
mutate(total = sum(count), percentage = (count / total) * 100) %>%
ggplot(aes_string(x = names(covs2)[i], y = "percentage", fill = "sampleset_UMAP")) +
geom_bar(stat="identity", position=position_dodge()) +
labs(x = NULL, y = "%", fill = col_name) +
theme_minimal()
plots[[colnames(covs2)[i]]] <- p
}
}
grid.arrange(plots$study , plots$braaksc,plots$ceradsc, plots$cogdx, plots$apoe_genotype, plots$neuroStatus, ncol=2)colnames(umap.data) <- c("UMAP1.KEGG", "UMAP2.KEGG")
covs2 <- merge(covs2, umap.data, by="row.names")
rownames(covs2) <- covs2$mrna_id
covs2$Row.names <- NULLplots.umap.kegg <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 3:13) {
# Crea una gráfica para cada variable
p <- ggplot(covs2, aes_string(x = "UMAP1.KEGG", y = "UMAP2.KEGG", color = names(covs2)[i])) +
geom_point(show.legend = TRUE) +
geom_text_repel(aes(label=ifelse(mrna_id %in% rownames(outliers.umap1) | mrna_id %in% rownames(outliers.umap2), as.character(gsub("_.*", "", covs2$mrna_id)), "")),
color = "black",
max.overlaps = 30, # Reduce el número máximo de solapamientos
point.padding = unit(0.2, "lines"), # Menos padding alrededor de los puntos
size = 3,
fontface = "bold",
segment.size = 0.2, # LÃneas de guÃa más finas
segment.color = 'grey50',
max.segment.length = unit(0.5, "lines"), # LÃneas de guÃa más cortas
arrow = arrow(length = unit(0.02, "npc"), type = "closed", ends = "last")) +
theme_minimal() +
labs(title = names(covs2)[i],
x = "UMAP1.KEGG", y = "UMAP2.KEGG", color = names(covs2)[i]) +
theme(
legend.title = element_blank(),
legend.text = element_text(size = 12),
text = element_text(size = 12),
legend.key.size = unit(0.5, "cm"),
plot.margin = margin(5, 5, 5, 5)# Agrega un margen alrededor de la gráfica si es necesario
)
plots.umap.kegg[[colnames(covs2)[i]]] <- p
}
grid.arrange(plots.umap.kegg$study, plots.umap.kegg$braaksc, ncol=1)geneset.AD.Andrews <- read_delim("~/Library/CloudStorage/OneDrive-UNIVERSIDADDEMURCIA/Documentos/Fernando/Master Bioinformatica/TFM/datos/geneset AD Andrews 2023 Lancet.csv", delim = ";", escape_double = FALSE, trim_ws = TRUE)
geneset.AD.Andrews <- unique(geneset.AD.Andrews$Gene)venn.plot <- venn.diagram(
x = list(GenesMatriz = colnames(mat_exp), GenesetAlzheimer = geneset.AD.Andrews),
category.names = c("Matrix Genes", "Geneset Alzheimer"),
filename = NULL,
output = FALSE, # Asegura que no se exporte a un archivo
fill = c("#00CED1", "#E9967A"),
cex = 1, # Aumenta el tamaño del texto
fontface = "bold",
cat.cex = 1, # Aumenta el tamaño del texto de las categorÃas
cat.fontface = "bold",
cat.default.pos = "text",
cat.pos = 25, #posicion de las categoricas
cat.dist = 0.1, #distancia de las categoricas
rotation.degree = 0,
margin = 0.1, # hacerla más pequeña
lwd=0.5,
lty = "dashed", # Estilo de lÃnea discontinua
edge.col = "grey", # Color de los bordes
main = "Alzheimer genes in exp matrix Andrews",
main.fontface= "bold",
main.cex = 2,
main.pos = c(0.5, 1)
)
grid.newpage() # Asegura que el lienzo esté limpio
grid.draw(venn.plot)Seleccionamos muestras que esten 2 veces la desviacion tipica para la PC1 y PC2
outlierspc1.Andrews <- as.data.frame(pca.AD.Andrews$x[abs(pca.AD.Andrews$x[,1]) > 2*(pca.AD.Andrews$sdev[1]),1])
outlierspc2.Andrews <- as.data.frame(pca.AD.Andrews$x[abs(pca.AD.Andrews$x[,2]) > 2*(pca.AD.Andrews$sdev[2]),2])Las ploteamos
Creamos los grupos de muestras seleccionados de los outliers de PC1 y/o PC2
mPC1.Andrews <- mat_exp[rownames(mat_exp) %in% rownames(outlierspc1.Andrews),]
mPC2.Andrews <- mat_exp[rownames(mat_exp) %in% rownames(outlierspc2.Andrews),]
mPC12.Andrews <- mat_exp[(rownames(mat_exp) %in% rownames(outlierspc2.Andrews)) & (rownames(mat_exp) %in% rownames(outlierspc1.Andrews)),]
not_in_PC12.Andrews <- mat_exp[!((rownames(mat_exp) %in% rownames(outlierspc2.Andrews)) | (rownames(mat_exp) %in% rownames(outlierspc1.Andrews))),]rownames(covs2) <- covs2$mrna_id
for (i in rownames(covs2)){
if (i %in% rownames(mPC1.Andrews) & !(i %in% rownames(mPC12.Andrews))){
covs2[i, "sampleset_PCA_Andrews"] <- "mPC1"
}
if (i %in% rownames(mPC2.Andrews) & !(i %in% rownames(mPC12.Andrews))){
covs2[i, "sampleset_PCA_Andrews"] <- "mPC2"
}
if (i %in% rownames(mPC12.Andrews)){
covs2[i, "sampleset_PCA_Andrews"] <- "mPC1 and mPC2"
}
if (!(i %in% rownames(mPC1.Andrews)) & !(i %in% rownames(mPC2.Andrews))){
covs2[i, "sampleset_PCA_Andrews"] <- "Not in both"
}
}
covs2$sampleset_PCA_Andrews <- as.factor(covs2$sampleset_PCA_Andrews)
data.frame(table(covs2$sampleset_PCA_Andrews))# Asegurarse de que todos son vectores del mismo largo para el dataframe final
max_length <- max(length(rownames(mPC1.Andrews)), length(rownames(mPC2.Andrews)), length(rownames(mPC12.Andrews)))
# Normalizar la longitud de los vectores (en caso de que alguno sea más corto)
rownames_mPC1.Andrews <- c(rownames(mPC1.Andrews), rep("", max_length - length(rownames(mPC1.Andrews))))
rownames_mPC2.Andrews <- c(rownames(mPC2.Andrews), rep("", max_length - length(rownames(mPC2.Andrews))))
rownames_mPC12.Andrews <- c(rownames(mPC12.Andrews), rep("", max_length - length(rownames(mPC12.Andrews))))
final_table <- data.frame(
mPC1 = rownames_mPC1.Andrews,
mPC2 = rownames_mPC2.Andrews,
mPC1_and_PC2 = rownames_mPC12.Andrews
)
final_tableMetemos una nueva variable en las covariables dependiendo de si son outliers de PC1 o del PC2 o de ninguno.
plots <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 1:ncol(covs2)) {
col_name <- colnames(covs2)[i]
if (class(covs2[[i]]) == "character" | class(covs2[[i]]) == "numeric"){
next
}
if (class(covs2[[i]]) == "factor"){
p <- covs2 %>%
group_by(across(all_of(col_name)), sampleset_PCA_Andrews) %>%
summarise(count = n(), .groups = 'drop') %>%
mutate(total = sum(count), percentage = (count / total) * 100) %>%
filter(sampleset_PCA_Andrews == "mPC1" | sampleset_PCA_Andrews == "mPC2" | sampleset_PCA_Andrews == "mPC1 and mPC2") %>%
ggplot(aes_string(x = names(covs2)[i], y = "percentage", fill = "sampleset_PCA_Andrews")) +
geom_bar(stat="identity", position=position_dodge()) +
labs(x = NULL, y = "%", fill = col_name) +
theme_minimal()
plots[[colnames(covs2)[i]]] <- p
}
}
grid.arrange(plots$study , plots$braaksc, plots$ceradsc, plots$cogdx, plots$apoe_genotype, plots$neuroStatus, ncol=2)pca.data.Andrews <- data.frame(pca.AD.Andrews$x)[,1:2]
colnames(pca.data.Andrews) <- c("PCA1.Andrews", "PCA2.Andrews")
covs2 <- merge(covs2, pca.data.Andrews, by="row.names")
rownames(covs2) <- covs2$mrna_id
covs2$Row.names <- NULLplots.pca.Andrews <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 3:13) {
# Crea una gráfica para cada variable
p <- ggplot(covs2, aes_string(x = "PCA1.Andrews", y = "PCA2.Andrews", color = names(covs2)[i])) +
geom_point(show.legend = TRUE) +
geom_text_repel(aes(label=ifelse(mrna_id %in% rownames(outlierspc1.Andrews) | mrna_id %in% rownames(outlierspc2.Andrews), as.character(gsub("_.*", "", covs2$mrna_id)), "")),
color = "black",
max.overlaps = 10, # Reduce el número máximo de solapamientos
point.padding = unit(0.2, "lines"), # Menos padding alrededor de los puntos
size = 3,
fontface = "bold",
segment.size = 0.2, # LÃneas de guÃa más finas
segment.color = 'grey50',
max.segment.length = unit(0.5, "lines"), # LÃneas de guÃa más cortas
arrow = arrow(length = unit(0.02, "npc"), type = "closed", ends = "last")) +
theme_minimal() +
labs(title = names(covs2)[i],
x = "PCA1.Andrews", y = "PCA2.Andrews", color = names(covs2)[i]) +
theme(
legend.title = element_blank(),
legend.text = element_text(size = 12),
text = element_text(size = 12),
legend.key.size = unit(0.5, "cm"),
plot.margin = margin(5, 5, 5, 5)# Agrega un margen alrededor de la gráfica si es necesario
)
plots.pca.Andrews[[colnames(covs2)[i]]] <- p
}
grid.arrange(plots.pca.Andrews$study, plots.pca.Andrews$braaksc, ncol=1)set.seed(1234)
tsne.Andrews <- Rtsne(mat.expre.AD.Andrews, dims = 2, theta = 0.0)
tsne.data.Andrews <- as.data.frame(tsne.Andrews$Y)
row.names(tsne.data.Andrews) <- row.names(mat.expre.AD.Andrews)
tsne.Andrews.covs <- merge(tsne.data.Andrews, covs, by = "row.names")
tsne.Andrews.covs$Row.names <- NULL
row.names(tsne.Andrews.covs) <- tsne.Andrews.covs$mrna_idrownames(tsne.data.Andrews) <- rownames(mat.expre.AD.Andrews)
# Calcular la media y la desviación estándar para cada componente de t-SNE
mean.tsne1 <- mean(tsne.data.Andrews[,1])
sd.tsne1 <- sd(tsne.data.Andrews[,1])
mean.tsne2 <- mean(tsne.data.Andrews[,2])
sd.tsne2 <- sd(tsne.data.Andrews[,2])
# Identificar muestras a más de 2 desviaciones estándar de la media
outliers.tsne1.Andrews <- tsne.data.Andrews[abs(tsne.data.Andrews[,1] - mean.tsne1) > 2 * sd.tsne1, ]
outliers.tsne2.Andrews <-tsne.data.Andrews[abs(tsne.data.Andrews[,2] - mean.tsne2) > 2 * sd.tsne2, ]mtSNE1.Andrews <- mat_exp[rownames(mat_exp) %in% rownames(outliers.tsne1.Andrews),]
mtSNE2.Andrews <- mat_exp[rownames(mat_exp) %in% rownames(outliers.tsne2.Andrews),]
mtSNE12.Andrews <- mat_exp[(rownames(mat_exp) %in% rownames(outliers.tsne1.Andrews)) & (rownames(mat_exp) %in% rownames(outliers.tsne2.Andrews)),]
not_in_tSNE12.Andrews <- mat_exp[!((rownames(mat_exp) %in% rownames(outliers.tsne1.Andrews)) | (rownames(mat_exp) %in% rownames(outliers.tsne2.Andrews))),]for (i in rownames(covs2)){
if (i %in% rownames(mtSNE1.Andrews) & !(i %in% rownames(mtSNE12.Andrews))){
covs2[i, "sampleset_tSNE_Andrews"] <- "mtSNE1"
}
if (i %in% rownames(mtSNE2.Andrews) & !(i %in% rownames(mtSNE12.Andrews))){
covs2[i, "sampleset_tSNE_Andrews"] <- "mtSNE2"
}
if (i %in% rownames(mtSNE12.Andrews)){
covs2[i, "sampleset_tSNE_Andrews"] <- "mtSNE1 and mtSNE2"
}
if (!(i %in% rownames(mtSNE1.Andrews)) & !(i %in% rownames(mtSNE2.Andrews))){
covs2[i, "sampleset_tSNE_Andrews"] <- "Not in both"
}
}
covs2$sampleset_tSNE_Andrews <- as.factor(covs2$sampleset_tSNE_Andrews)
data.frame(table(covs2$sampleset_tSNE_Andrews))# Asegurarse de que todos son vectores del mismo largo para el dataframe final
max_length <- max(length(rownames(mtSNE1.Andrews)), length(rownames(mtSNE2.Andrews)), length(rownames(mtSNE12.Andrews)))
# Normalizar la longitud de los vectores (en caso de que alguno sea más corto)
rownames_tsne1.Andrews <- c(rownames(mtSNE1.Andrews), rep("", max_length - length(rownames(mtSNE1.Andrews))))
rownames_tsne2.Andrews <- c(rownames(mtSNE2.Andrews), rep("", max_length - length(rownames(mtSNE2.Andrews))))
rownames_tsne12.Andrews <- c(rownames(mtSNE12.Andrews), rep("", max_length - length(rownames(mtSNE12.Andrews))))
final_table <- data.frame(
mtSNE1 = rownames_tsne1.Andrews,
mtSNE2 = rownames_tsne2.Andrews,
mtSNE1_and_mtSNE2 = rownames_tsne12.Andrews
)
final_tableplots <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 1:ncol(covs2)) {
col_name <- colnames(covs2)[i]
if (class(covs2[[i]]) == "character" | class(covs2[[i]]) == "numeric" ){
next
}
if (class(covs2[[i]]) == "factor"){
p <- covs2 %>%
group_by(across(all_of(col_name)), sampleset_tSNE_Andrews) %>%
summarise(count = n(), .groups = 'drop') %>%
filter(sampleset_tSNE_Andrews == "mtSNE1" | sampleset_tSNE_Andrews == "mtSNE2" | sampleset_tSNE_Andrews == "mtSNE1 and mtSNE2") %>%
mutate(total = sum(count), percentage = (count / total) * 100) %>%
ggplot(aes_string(x = names(covs2)[i], y = "percentage", fill = "sampleset_tSNE_Andrews")) +
geom_bar(stat="identity", position=position_dodge()) +
labs(x = NULL, y = "%", fill = col_name) +
theme_minimal()
plots[[colnames(covs2)[i]]] <- p
}
}
grid.arrange(plots$study , plots$braaksc, plots$ceradsc, plots$cogdx, plots$apoe_genotype, plots$neuroStatus, ncol=2)colnames(tsne.data.Andrews) <- c("tSNE1.Andrews", "tSNE2.Andrews")
covs2 <- merge(covs2, tsne.data.Andrews, by="row.names")
rownames(covs2) <- covs2$mrna_id
covs2$Row.names <- NULLplots.tsne.Andrews <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 3:13) {
# Crea una gráfica para cada variable
p <- ggplot(covs2, aes_string(x = "tSNE1.Andrews", y = "tSNE2.Andrews", color = names(covs2)[i])) +
geom_point(show.legend = TRUE) +
geom_text_repel(aes(label=ifelse((covs2$mrna_id %in% rownames(outliers.tsne1.Andrews) | covs2$mrna_id %in% rownames(outliers.tsne2.Andrews)), as.character(gsub("_.*", "", covs2$mrna_id)), "")),
color = "black",
max.overlaps = 30, # Reduce el número máximo de solapamientos
point.padding = unit(0.2, "lines"), # Menos padding alrededor de los puntos
size = 3,
fontface = "bold",
segment.size = 0.2, # LÃneas de guÃa más finas
segment.color = 'grey50',
max.segment.length = unit(0.5, "lines"), # LÃneas de guÃa más cortas
arrow = arrow(length = unit(0.02, "npc"), type = "closed", ends = "last")) +
theme_minimal() +
labs(title = names(covs2)[i],
x = "tSNE1.Andrews", y = "tSNE2.Andrews", color = names(covs2)[i]) +
theme(
legend.title = element_blank(),
legend.text = element_text(size = 12),
text = element_text(size = 12),
legend.key.size = unit(0.5, "cm"),
plot.margin = margin(5, 5, 5, 5)# Agrega un margen alrededor de la gráfica si es necesario
)
plots.tsne.Andrews[[colnames(covs2)[i]]] <- p
}
grid.arrange(plots.tsne.Andrews$study, plots.tsne.Andrews$braaksc, ncol=1)local.config <- umap.defaults
# local.config$n_neighbors <- 4
# local.config$n_components <- 2
# local.config$n_epochs <- 100
# local.config$metric<- "euclidean"
set.seed(1234)
umap.ad.Andrews <- umap(mat.expre.AD.Andrews,random_stage=1234, local.config)
umap.data.Andrews <- as.data.frame(umap.ad.Andrews$layout)
row.names(umap.data.Andrews) <- row.names(mat.expre.AD.Andrews)
umap.data.covs.Andrews <- merge(umap.data.Andrews, covs2, by = "row.names")
umap.data.covs.Andrews$Row.names <- NULL
row.names(umap.data.covs.Andrews) <- umap.data.covs.Andrews$mrna_idrownames(umap.data.Andrews) <- rownames(mat.expre.AD.Andrews)
# Calcular la media y la desviación estándar para cada componente de t-SNE
mean.umap1 <- mean(umap.data.Andrews[,1])
sd.umap1 <- sd(umap.data.Andrews[,1])
mean.umap2 <- mean(umap.data.Andrews[,2])
sd.umap2 <- sd(umap.data.Andrews[,2])
# Identificar muestras a más de 2 desviaciones estándar de la media
outliers.umap1.Andrews <- umap.data.Andrews[abs(umap.data.Andrews[,1] - mean.umap1) > 2 * sd.umap1, ]
outliers.umap2.Andrews <-umap.data.Andrews[abs(umap.data.Andrews[,2] - mean.umap2) > 2 * sd.umap2, ]mUMAP1.Andrews <- mat_exp[rownames(mat_exp) %in% rownames(outliers.umap1.Andrews),]
mUMAP2.Andrews <- mat_exp[rownames(mat_exp) %in% rownames(outliers.umap2.Andrews),]
mUMAP12.Andrews <- mat_exp[(rownames(mat_exp) %in% rownames(outliers.umap1.Andrews)) & (rownames(mat_exp) %in% rownames(outliers.umap2.Andrews)),]
not_in_UMAP12.Andrews <- mat_exp[!((rownames(mat_exp) %in% rownames(outliers.umap1.Andrews)) | (rownames(mat_exp) %in% rownames(outliers.umap2.Andrews))),]for (i in rownames(covs2)){
if (i %in% rownames(mUMAP1.Andrews) & !(i %in% rownames(mUMAP12.Andrews))){
covs2[i, "sampleset_UMAP_Andrews"] <- "mUMAP1"
}
if (i %in% rownames(mUMAP2.Andrews) & !(i %in% rownames(mUMAP12.Andrews))){
covs2[i, "sampleset_UMAP_Andrews"] <- "mUMAP2"
}
if (i %in% rownames(mUMAP12.Andrews)){
covs2[i, "sampleset_UMAP_Andrews"] <- "mUMAP1 and mUMAP2"
}
if (!(i %in% rownames(mUMAP1.Andrews)) & !(i %in% rownames(mUMAP2.Andrews))){
covs2[i, "sampleset_UMAP_Andrews"] <- "Not in both"
}
}
covs2$sampleset_UMAP_Andrews <- as.factor(covs2$sampleset_UMAP_Andrews)
data.frame(table(covs2$sampleset_UMAP_Andrews))# Asegurarse de que todos son vectores del mismo largo para el dataframe final
max_length <- max(length(rownames(mUMAP1.Andrews)), length(rownames(mUMAP2.Andrews)), length(rownames(mUMAP12.Andrews)))
# Normalizar la longitud de los vectores (en caso de que alguno sea más corto)
rownames_umap1.Andrews <- c(rownames(mUMAP1.Andrews), rep("", max_length - length(rownames(mUMAP1.Andrews))))
rownames_umap2.Andrews <- c(rownames(mUMAP2.Andrews), rep("", max_length - length(rownames(mUMAP2.Andrews))))
rownames_umap12.Andrews <- c(rownames(mUMAP12.Andrews), rep("", max_length - length(rownames(mUMAP12.Andrews))))
final_table <- data.frame(
mUMAP1 = rownames_umap1.Andrews,
mUMAP2 = rownames_umap2.Andrews,
mUMAP1_and_mUMAP2 = rownames_umap12.Andrews
)
final_tableplots <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 1:ncol(covs2)) {
col_name <- colnames(covs2)[i]
if (class(covs2[[i]]) == "character" | class(covs2[[i]]) == "numeric"){
next
}
if (class(covs2[[i]]) == "factor"){
p <- covs2 %>%
group_by(across(all_of(col_name)), sampleset_UMAP_Andrews) %>%
summarise(count = n(), .groups = 'drop') %>%
filter(sampleset_UMAP_Andrews == "mUMAP1" | sampleset_UMAP_Andrews == "mUMAP2" | sampleset_UMAP_Andrews == "mUMAP1 and mUMAP2") %>%
mutate(total = sum(count), percentage = (count / total) * 100) %>%
ggplot(aes_string(x = names(covs2)[i], y = "percentage", fill = "sampleset_UMAP_Andrews")) +
geom_bar(stat="identity", position=position_dodge()) +
labs(x = NULL, y = "%", fill = col_name) +
theme_minimal()
plots[[colnames(covs2)[i]]] <- p
}
}
grid.arrange(plots$study , plots$braaksc,plots$ceradsc, plots$cogdx, plots$apoe_genotype, plots$neuroStatus, ncol=2)colnames(umap.data.Andrews) <- c("UMAP1.Andrews", "UMAP2.Andrews")
covs2 <- merge(covs2, umap.data.Andrews, by="row.names")
rownames(covs2) <- covs2$mrna_id
covs2$Row.names <- NULLplots.umap.Andrews <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 3:13) {
# Crea una gráfica para cada variable
p <- ggplot(covs2, aes_string(x = "UMAP1.Andrews", y = "UMAP2.Andrews", color = names(covs2)[i])) +
geom_point(show.legend = TRUE) +
geom_text_repel(aes(label=ifelse((covs2$mrna_id %in% rownames(outliers.umap1.Andrews) | covs2$mrna_id %in% rownames(outliers.umap2.Andrews)), as.character(gsub("_.*", "", covs2$mrna_id)), "")),
color = "black",
max.overlaps = 30,
point.padding = unit(0.2, "lines"), # Menos padding alrededor de los puntos
size = 3,
fontface = "bold",
segment.size = 0.2, # LÃneas de guÃa más finas
segment.color = 'grey50',
max.segment.length = unit(0.5, "lines"), # LÃneas de guÃa más cortas
arrow = arrow(length = unit(0.02, "npc"), type = "closed", ends = "last")) +
theme_minimal() +
labs(title = names(covs2)[i],
x = "UMAP1.Andrews", y = "UMAP2.Andrews", color = names(covs2)[i]) +
theme(
legend.title = element_blank(),
legend.text = element_text(size = 12),
text = element_text(size = 12),
legend.key.size = unit(0.5, "cm"),
plot.margin = margin(5, 5, 5, 5)# Agrega un margen alrededor de la gráfica si es necesario
)
plots.umap.Andrews[[colnames(covs2)[i]]] <- p
}
grid.arrange(plots.umap.Andrews$study, plots.umap.Andrews$braaksc, ncol=1)grid.arrange(plots.pca.kegg$neuroStatus, plots.pca.Andrews$neuroStatus,plots.tsne.kegg$neuroStatus, plots.tsne.Andrews$neuroStatus, plots.umap.kegg$neuroStatus, plots.umap.Andrews$neuroStatus,ncol = 2)all.outliers <- list(PCA.KEGG = c(rownames(outlierspc1_scaled), rownames(outlierspc2_scaled)),
tSNE.KEGG = c(rownames(outliers.tsne1), rownames(outliers.tsne2)),
UMAP.KEGG = c(rownames(outliers.umap1), rownames(outliers.umap2)),
PCA.Andrews = c(rownames(outlierspc1.Andrews), rownames(outlierspc2.Andrews)),
tSNE.Andrews = c(rownames(outliers.tsne1.Andrews), rownames(outliers.tsne2.Andrews)),
UMAP.Andrews = c(rownames(outliers.umap1.Andrews), rownames(outliers.umap2.Andrews))
)
max_length <- max(sapply(all.outliers, length))
# Función para normalizar las longitudes de las listas
normalize_length <- function(x, max_length) {
length(x) <- max_length # Esto rellenará con NA si la lista es más corta que max_length
x
}
# Aplicar la normalización a cada lista en 'all.outliers'
all.outliers <- lapply(all.outliers, normalize_length, max_length = max_length)
# Convertir la lista normalizada en un data.frame
all.outliers <- data.frame(all.outliers)
all.outliers# Convertimos el data frame a un vector para hacer la tabla
all_values.outliers <- unlist(all.outliers, use.names = FALSE)
# Usamos table() para contar las frecuencias de cada muestra
table_values <- table(all_values.outliers)
# Filtramos para mostrar solo las muestras que se repiten
repeated_samples <- names(table_values[table_values > 1])
# Unlist with names to track the original column of each value
all_values_named <- unlist(all.outliers, use.names = TRUE)
# Extract unique non-NA samples
unique_samples <- unique(all_values_named[!is.na(all_values_named)])
# Create an empty matrix to fill with presence (1) or absence (0)
incidence_matrix <- matrix(0, nrow = length(unique_samples), ncol = length(all.outliers))
rownames(incidence_matrix) <- unique_samples
colnames(incidence_matrix) <- names(all.outliers)
# Populate the incidence matrix
for (col in names(all.outliers)) {
present_samples <- all.outliers[[col]][!is.na(all.outliers[[col]])] # Extract non-NA samples from column
incidence_matrix[rownames(incidence_matrix) %in% present_samples, col] <- 1
}
# Melt the incidence matrix for ggplot
melted_incidence_matrix <- melt(incidence_matrix, varnames = c("sample", "column"), value.name = "presence")
colnames(melted_incidence_matrix)[3] <- "presence"
# Convertir la matriz de incidencia en un data frame
incidence_df <- as.data.frame(incidence_matrix)
# Crear un data frame para contar y listar las columnas de aparición
summary_df <- data.frame(
sample = rownames(incidence_df),
count = rowSums(incidence_df),
columns = apply(incidence_df, 1, function(row) paste(names(df)[as.logical(row)], collapse = ", "))
)
# Ordenar el summary_df para mejorar la visualización
summary_df <- summary_df %>%
arrange(desc(count)) %>%
mutate(sample = factor(sample, levels = sample))
# Crear el gráfico de barras
ggplot(summary_df, aes(x = sample, y = count, fill = count)) + # Mostrar solo las primeras 20 muestras para claridad
geom_bar(stat = "identity", color = "black", show.legend = FALSE) +
coord_flip() + # Invertir el gráfico para mejor visualización de etiquetas
labs(x = "Sample", y = "Count of Appearances", title = "Sample Appearances Across dimensions") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
axis.title.x = element_text(size = 12, face = "bold"),
axis.title.y = element_text(size = 12, face = "bold"),
plot.title = element_text(size = 14, face = "bold", hjust = 0.5))# Calcular el número total de presencias para cada muestra
total_presences <- rowSums(incidence_matrix)
melted_incidence_matrix$total_presence <- total_presences[as.character(melted_incidence_matrix$sample)]
melted_incidence_matrix$column <- gsub(pattern = "outliers.", replacement = "", x = melted_incidence_matrix$column)
melted_incidence_matrix$sample <- gsub(pattern = "_.*", replacement = "", x = melted_incidence_matrix$sample)
filtered_data <- melted_incidence_matrix %>%
filter(presence == 1) %>%
arrange(desc(total_presence)) %>%
mutate(sample = factor(sample, levels = unique(sample)))
ggplot(filtered_data, aes(x = column, y = sample, size = total_presence)) +
geom_point(color = "steelblue") +
scale_size(range = c(2, 6), guide = "none") + # Adjustable size scale with no size legend
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "right") +
labs(y = "Sample", x= "Dimension", title = "Presence of Samples Across Dimensions", size = "Total Presence")venn.plot <- venn.diagram(
x = lista.outliers[1:3],
category.names = c("PCA", "tSNE", "UMAP"),
filename = NULL,
output = FALSE,
fill = c("#440154ff", '#21908dff', '#fde725ff'),
cex = 1, # Aumenta el tamaño del texto
fontface = "bold",
cat.cex = 1, # Aumenta el tamaño del texto de las categorÃas
cat.fontface = "bold",
cat.default.pos = "text",
cat.pos = c(10, -10, 0), #posicion de las categoricas
cat.dist = c(0.3, 0.35, 0.28), #distancia de las categoricas
rotation.degree = 0,
margin = 0.1, # hacerla más pequeña
lwd=0.5,
lty = "dashed", # Estilo de lÃnea discontinua
edge.col = "grey", # Color de los bordes
main = paste0("Diagrama Venn KEGG geneset"),
main.fontface= "bold",
main.cex = 2,
main.pos = c(0.5, 0.9)
)
interseccion_ABC <- Reduce(intersect, lista.outliers[1:3])
grid.newpage() # Asegura que el lienzo esté limpio
grid.draw(venn.plot)
grid.text(label = paste(gsub("_.*","",interseccion_ABC), collapse = "\n"),
x = 0.5,
y = 0.3,
gp = gpar(fontsize = 8,
col = "black",
fontface = "italic"))
interseccion_AB <- Reduce(intersect, lista.outliers[1:2])
grid.text(label = paste(gsub("_.*","",interseccion_AB[!(interseccion_AB %in% interseccion_ABC)]), collapse = "\n"),
x = 0.7,
y = 0.3,
gp = gpar(fontsize = 8,
col = "black",
fontface = "italic"))
interseccion_BC <- Reduce(intersect, lista.outliers[2:3])
grid.text(label = paste(gsub("_.*","",interseccion_BC[!(interseccion_BC %in% interseccion_ABC)]), collapse = "\n"),
x = 0.32,
y = 0.35,
gp = gpar(fontsize = 8,
col = "black",
fontface = "italic"))venn.plot <- venn.diagram(
x = lista.outliers[4:6],
category.names = c("PCA", "tSNE", "UMAP"),
filename = NULL,
output = FALSE,
fill = c("#440154ff", '#21908dff', '#fde725ff'),
cex = 1, # Aumenta el tamaño del texto
fontface = "bold",
cat.cex = 1, # Aumenta el tamaño del texto de las categorÃas
cat.fontface = "bold",
cat.default.pos = "text",
cat.pos = c(10, -10, 0), #posicion de las categoricas
cat.dist = c(0.35, 0.1, 0.3), #distancia de las categoricas
rotation.degree = 0,
margin = 0.1, # hacerla más pequeña
lwd=0.5,
lty = "dashed", # Estilo de lÃnea discontinua
edge.col = "grey", # Color de los bordes
main = paste0("Diagrama Venn Andrews geneset"),
main.fontface= "bold",
main.cex = 2,
main.pos = c(0.5, 0.9)
)
interseccion_ABC <- Reduce(intersect, lista.outliers[4:6])
grid.newpage() # Asegura que el lienzo esté limpio
grid.draw(venn.plot)
grid.text(label = paste(gsub("_.*","",interseccion_ABC), collapse = "\n"),
x = 0.58,
y = 0.35,
gp = gpar(fontsize = 8,
col = "black",
fontface = "italic"))
interseccion_AC <- Reduce(intersect, lista.outliers[c(4,6)])
grid.text(label = paste(gsub("_.*","",interseccion_AC[!(interseccion_AC %in% interseccion_ABC)]), collapse = "\n"),
x = 0.42,
y = 0.35,
gp = gpar(fontsize = 8,
col = "black",
fontface = "italic"))
interseccion_BC <- Reduce(intersect, lista.outliers[5:6])
grid.text(label = paste(gsub("_.*","",interseccion_BC[!(interseccion_BC %in% interseccion_ABC)]), collapse = "\n"),
x = 0.68,
y = 0.4,
gp = gpar(fontsize = 8,
col = "black",
fontface = "italic"))venn.plot <- venn.diagram(
x = lista.outliers[c(1,4)],
category.names = c("KEGG", "Andrews"),
filename = NULL,
output = FALSE,
fill = c("#440154ff", '#fde725ff'),
cex = 1, # Aumenta el tamaño del texto
fontface = "bold",
cat.cex = 1, # Aumenta el tamaño del texto de las categorÃas
cat.fontface = "bold",
cat.default.pos = "text",
cat.pos = c(0, 0), #posicion de las categoricas
cat.dist = c(0.1, 0.1), #distancia de las categoricas
rotation.degree = 0,
margin = 0.1, # hacerla más pequeña
lwd=0.5,
lty = "dashed", # Estilo de lÃnea discontinua
edge.col = "grey", # Color de los bordes
main = paste0("Diagrama Venn PCA geneset"),
main.fontface= "bold",
main.cex = 2,
main.pos = c(0.5, 0.9)
)
interseccion_AB <- Reduce(intersect, lista.outliers[c(1,4)])
grid.newpage() # Asegura que el lienzo esté limpio
grid.draw(venn.plot)
grid.text(label = paste(gsub("_.*","",interseccion_AB), collapse = "\n"),
x = 0.58,
y = 0.4,
gp = gpar(fontsize = 8,
col = "black",
fontface = "italic"))venn.plot <- venn.diagram(
x = lista.outliers[c(2,5)],
category.names = c("KEGG", "Andrews"),
filename = NULL,
output = FALSE,
fill = c("#440154ff", '#fde725ff'),
cex = 1, # Aumenta el tamaño del texto
fontface = "bold",
cat.cex = 1, # Aumenta el tamaño del texto de las categorÃas
cat.fontface = "bold",
cat.default.pos = "text",
cat.pos = c(0, 0), #posicion de las categoricas
cat.dist = c(0.1, 0.1), #distancia de las categoricas
rotation.degree = 0,
margin = 0.1, # hacerla más pequeña
lwd=0.5,
lty = "dashed", # Estilo de lÃnea discontinua
edge.col = "grey", # Color de los bordes
main = paste0("Diagrama Venn tSNE geneset"),
main.fontface= "bold",
main.cex = 2,
main.pos = c(0.5, 0.9)
)
interseccion_AB <- Reduce(intersect, lista.outliers[c(2,5)])
grid.newpage() # Asegura que el lienzo esté limpio
grid.draw(venn.plot)
grid.text(label = paste(gsub("_.*","",interseccion_AB), collapse = "\n"),
x = 0.63,
y = 0.4,
gp = gpar(fontsize = 8,
col = "black",
fontface = "italic"))venn.plot <- venn.diagram(
x = lista.outliers[c(3,6)],
category.names = c("KEGG", "Andrews"),
filename = NULL,
output = FALSE,
fill = c("#440154ff", '#fde725ff'),
cex = 1, # Aumenta el tamaño del texto
fontface = "bold",
cat.cex = 1, # Aumenta el tamaño del texto de las categorÃas
cat.fontface = "bold",
cat.default.pos = "text",
cat.pos = c(0, 0), #posicion de las categoricas
cat.dist = c(0.1, 0.1), #distancia de las categoricas
rotation.degree = 0,
margin = 0.1, # hacerla más pequeña
lwd=0.5,
lty = "dashed", # Estilo de lÃnea discontinua
edge.col = "grey", # Color de los bordes
main = paste0("Diagrama Venn UMAP geneset"),
main.fontface= "bold",
main.cex = 2,
main.pos = c(0.5, 0.9)
)
interseccion_AB <- Reduce(intersect, lista.outliers[c(3,6)])
grid.newpage() # Asegura que el lienzo esté limpio
grid.draw(venn.plot)
grid.text(label = paste(gsub("_.*","",interseccion_AB), collapse = "\n"),
x = 0.52,
y = 0.3,
gp = gpar(fontsize = 8,
col = "black",
fontface = "italic"))